### R Script for Table 2 ###

library(dplyr)
library(haven)
library(readxl)
library(tidyr)
library(tidytext)
library(writexl)

# CHANGE THIS: set the working directory to the folder that holds the data files

setwd("/path/to/directory")

# load the Stata (.dta) source data
mydf1 <- read_dta("data_src_VAA_comments_for_textanalysis.dta")

# 'rural' flag: 1 when urban2017 is below 60 and population2017 is below 15000,
# 0 when that condition is false (NA when it cannot be evaluated)
mydf2 <- mutate(
  mydf1,
  rural = if_else(urban2017 < 60 & population2017 < 15000, 1, 0)
)

# lookup table used to build the binary variable 'refugee_position':
# the two "samaa mieltä" answers are coded 1 (favorable about refugees),
# the two "eri mieltä" answers and "ohita kysymys" are coded 0
mydictrefugpos <- setNames(
  c(1, 1, 0, 0, 0),
  c(
    "jokseenkin samaa mieltä",
    "täysin samaa mieltä",
    "täysin eri mieltä",
    "jokseenkin eri mieltä",
    "ohita kysymys"
  )
)

# look up each value of 'refugees' in the table to create 'refugee_position'
# NOTE(review): assumes 'refugees' holds the answer text as character, so that
# values absent from the table become NA -- confirm against the data
mydf3 <- mutate(mydf2, refugee_position = mydictrefugpos[refugees])

# only keeping the candidates who have a favorable position on refugee intake and who have left a comment
# starting dataframe
# filter() drops the rows for which a condition evaluates to NA (for example
# when 'refugee_position' is NA); logical `[` indexing would instead insert
# a row made entirely of NA for each of them
mydfstart <- mydf3 %>%
  dplyr::filter(refugee_position == 1, refugee_comment != "")

# word mining

# restrict the data to the identifier and the comment text
mydfwm1 <- mydfstart[c('id', 'refugee_comment')]

# insert a space after '.', ',', ':' or ';' whenever the next character is not
# a space (otherwise problems with unnest_tokens)
mydfwm1[["refugee_comment"]] <- gsub(
  pattern = '([.,:;])([^ ])',
  replacement = '\\1 \\2',
  x = mydfwm1[["refugee_comment"]]
)

# split the comments into one word per row
# (beware: the hyphen "-" separates words)
mydfwm2 <- unnest_tokens(mydfwm1, word, refugee_comment)

# one-column data frame listing every distinct word
mywords <- data.frame(word = unique(mydfwm2$word))

# write the word list to an Excel file
write_xlsx(mywords, "data_mywords.xlsx")

# In this file the author manually adds a column that gives for each word its category
# The words without a category are dropped
# This Excel file is called "data_myword2category.xlsx" (Provided in the folder for replication purposes)

myw2c <- read_excel("data_myword2category.xlsx")

# the lookup needs the 'word' column first, followed by the category column
stopifnot(identical(names(myw2c)[1], "word"), ncol(myw2c) >= 2)

# a word listed twice would make the lookup ambiguous: report it
if (anyDuplicated(myw2c[["word"]]) > 0) {
  warning(
    "duplicated words in data_myword2category.xlsx; ",
    "the first category listed for each word is used",
    call. = FALSE
  )
}

# dictionary: 'word' -> 'category', stored as a named character vector
# (names are the words, values are the categories from the second column)
mydictword2category <- setNames(
  as.character(myw2c[[2]]),
  myw2c[["word"]]
)

# adding column 'categ': the category of each word, NA when the word is not
# in the dictionary
mydfwm3 <- mydfwm2 %>%
  mutate(categ = unname(mydictword2category[word]))

# add a column to count each 'categ'
# %in% never returns NA, so a word without a category counts as 0 everywhere
mydfwm4 <- mydfwm3 %>%
  mutate(
    dutyhelp   = as.numeric(categ %in% 'DUTY&HELP'),
    experience = as.numeric(categ %in% 'EXPERIENCE'),
    jobs       = as.numeric(categ %in% 'JOBS'),
    population = as.numeric(categ %in% 'POPULATION')
  )

# only keeping the columns needed for aggregate analysis
mydfwm5 <- mydfwm4[, c('id','dutyhelp','experience','jobs','population')]

# the columns indicate by 1 or 0 if the 'categ' is mentioned or not in the response
# (maximum over all the words of a response; 'Group.1' holds the grouping id)
mydfwmfinal <- aggregate(mydfwm5, by = list(mydfwm5$id), FUN = 'max')

# merging
# A response that produced no word at all has no row in 'mydfwmfinal', so the
# join leaves NA in its category columns (recognisable by a missing 'id2').
# Such a response mentions no category: it is set to 0 so that the NA does not
# propagate into the sums and percentages computed from these columns.
mydffinal <- left_join(mydfstart, mydfwmfinal, by = 'id') %>%
  rename(id2 = Group.1) %>%
  mutate(
    dutyhelp   = if_else(is.na(id2), 0, dutyhelp),
    experience = if_else(is.na(id2), 0, experience),
    jobs       = if_else(is.na(id2), 0, jobs),
    population = if_else(is.na(id2), 0, population)
  )

# you can export this to inspect what it looks like
write_xlsx(mydffinal, "data_mydffinal.xlsx")

# restrict the data to the columns used for the results
mydffinalshort <- mydffinal[
  c('Treated2015', 'rural', 'dutyhelp', 'experience', 'jobs', 'population')
]

## results for Table 2 ##

# cross-tabulation of Treated2015 by rural: the 4 groups and the number of
# candidates in each
mytable <- table(
  treated = mydffinalshort$Treated2015,
  rural   = mydffinalshort$rural
)

# total number of unique mentions of each category in each group
# (column sums within every Treated2015 x rural combination)
mytabletreatedrural <- rename(
  aggregate(
    mydffinalshort,
    by = list(mydffinalshort$Treated2015, mydffinalshort$rural),
    FUN = "sum"
  ),
  'treated?' = Group.1,
  'rural?' = Group.2
)

# percentage of mentions of each category in each group (column means times
# 100, rounded to 2 decimals). These are the values in Table 2
mytabletreatedruralpercent <- rename(
  aggregate(
    mydffinalshort,
    by = list(mydffinalshort$Treated2015, mydffinalshort$rural),
    FUN = function(values) round(mean(values) * 100, 2)
  ),
  'treated?' = Group.1,
  'rural?' = Group.2
)

# write the three result tables to Excel files
write_xlsx(as.data.frame(mytable), "data_mytable.xlsx")
write_xlsx(mytabletreatedrural, "data_mytabletreatedrural.xlsx")
write_xlsx(mytabletreatedruralpercent, "data_mytabletreatedruralpercent.xlsx")

# END
